Replicating pipeline and results described in Galbraith et al. (2016). PNAS. https://www.pnas.org/content/113/4/1020



Overview of pipeline



Requirements


if(suppressWarnings(require("purrr"))==F){
  install.packages("purrr")
  library(purrr)
}

if(suppressWarnings(require("tidyverse"))==F){
  install.packages("tidyverse")
  library(tidyverse)
}

if(suppressWarnings(require("plyr"))==F){
  install.packages("plyr")
  library(plyr)
}

if(suppressWarnings(require("ggplot2"))==F){
  install.packages("ggplot2")
  library(ggplot2)
}

if(suppressWarnings(require("Rfast"))==F){
  install.packages("Rfast")
  library(Rfast)
}

if(suppressWarnings(require("tryCatchLog"))==F){
  install.packages("tryCatchLog")
  library(tryCatchLog)
}

if(suppressWarnings(require("lme4"))==F){
  install.packages("lme4")
  library(lme4)
}

if(suppressWarnings(require("car"))==F){
  install.packages("car")
  library(car)
}

if(suppressWarnings(require("viridis"))==F){
  install.packages("viridis")
  library(viridis)
}

if(suppressWarnings(require("gridExtra"))==F){
  install.packages("gridExtra")
  library(gridExtra)
}

if(suppressWarnings(require("kableExtra"))==F){
  install.packages("kableExtra")
  library(kableExtra)
}


Setup


Sample metadata

metadata <- read.csv("metadata_OD.csv")
kbl(metadata) %>% kable_styling()
sample.id parent cross phenotype lineage
X8751ID 875D A sterile EHB
X8756ID 875D A sterile EHB
X875aID 875D A sterile EHB
X8827ID 882D B sterile EHB
X882gID 882D B sterile EHB
X882kID 882D B sterile EHB
X882mID 882D B sterile EHB
X888eID 888D A sterile AHB
X888qID 888D A sterile AHB
X888uID 888D A sterile AHB
X894cID 894D B sterile AHB
X894fID 894D B sterile AHB
X894iID 894D B sterile AHB
X8751IQ 875Q A sterile EHB
X8756IQ 875Q A sterile EHB
X875aIQ 875Q A sterile EHB
X8827IQ 882Q B sterile EHB
X882giQ 882Q B sterile EHB
X882kIQ 882Q B sterile EHB
X882mIQ 882Q B sterile EHB
X888eIQ 888Q A sterile AHB
X888qIQ 888Q A sterile AHB
X888uIQ 888Q A sterile AHB
X894cIQ 894Q B sterile AHB
X894fIQ 894Q B sterile AHB
X894iIQ 894Q B sterile AHB
X8754AD 875D A reproductive EHB
X875cAD 875D A reproductive EHB
X875gAD 875D A reproductive EHB
X8820AD 882D B reproductive EHB
X882cAD 882D B reproductive EHB
X882pAD 882D B reproductive EHB
X888aAD 888D A reproductive AHB
X888hAD 888D A reproductive AHB
X888jAD 888D A reproductive AHB
X894bAD 894D B reproductive AHB
X894qAD 894D B reproductive AHB
X894rAD 894D B reproductive AHB
X8754AQ 875Q A reproductive EHB
X875cAQ 875Q A reproductive EHB
X875gAQ 875Q A reproductive EHB
X8820AQ 882Q B reproductive EHB
X882caQ 882Q B reproductive EHB
X882pAQ 882Q B reproductive EHB
X888aAQ 888Q A reproductive AHB
X888hAQ 888Q A reproductive AHB
X888jAQ 888Q A reproductive AHB
X894bAQ 894Q B reproductive AHB
X894qAQ 894Q B reproductive AHB
X894rAQ 894Q B reproductive AHB

Generate SNP:gene count matrices


Generate more convenient TSV from SNP:gene bed file

  1. Load SNPs_for_analysis.bed, keep columns 1 (chromosome), 2 (position), and 4 (SNP:gene).
  2. Split column 3 by “:”, create new columns 4 (SNP ID) and 5 (gene ID).
  3. Save as TSV.
SNPs <- read.table("sNPs_for_analysis_sorted.bed",sep="\t",header=F)[,c(1,2,4)]
names(SNPs) <- c("chr","pos","SNP_gene")
SNPs$SNP <- as.character(map(strsplit(SNPs$SNP_gene, split = ":"), 1))
SNPs$geneID <- as.character(map(strsplit(SNPs$SNP_gene, split = ":"), 2))
write.table(SNPs,"SNP_gene_overlaps.txt",sep="\t",quote=F,row.names=F)
kbl(head(SNPs)) %>% kable_styling()
SNPs <- read.table("SNP_gene_overlaps.txt",sep="\t",header=1)

Set up count matrix

  1. Create empty data frame SNP_counts with as many rows as SNP:genes.
  2. Create a list of all *.txt files in the counts_SNPs directory.
SNP_counts <- data.frame(matrix(ncol=0,nrow=length(SNPs$SNP_gene)))
SNP_counts$SNP_gene <- SNPs$SNP_gene

files.counts <- list.files(path="counts_tophat2", 
                           pattern="*.txt", 
                           full.names=TRUE, 
                           recursive=FALSE)
head(files.counts)
  1. For each file listed in files.counts:
  • Read in the file to tmp, keep columns 4 (SNP:gene) and 7 (counts).
  • Store the sample ID (lineParent_SRA) from the file name to tmp.name.
  • Set column names of tmp to column 1 (SNP:gene) and column 2 (tmp.name).
  • Left join tmp to SNP_counts.
  1. Set the row names of SNP_counts to the SNP:gene IDs stored in column 1 (SNP:gene).
  2. Remove column 1 (SNP:gene).
  3. Fill empty cells with 0.
  4. Save as CSV.
for(i in 1:length(files.counts)){
  tmp <- read.table(files.counts[[i]],header=F)[,c(4,7)]
  tmp.name <-  strsplit(strsplit(files.counts[[i]], 
                                 split = "/")[[1]][2],
                        split = "[.]")[[1]][1]
  names(tmp) <- c("SNP_gene",tmp.name)
  SNP_counts <- SNP_counts %>% 
    left_join(tmp, by = c('SNP_gene' = 'SNP_gene')) 
}

row.names(SNP_counts) <- SNP_counts$SNP_gene
SNP_counts$SNP_gene <- NULL

SNP_counts[is.na(SNP_counts)] <- 0

write.csv(SNP_counts,"SNP_gene_counts.csv")

kbl(head(SNP_counts[,c(1:5)])) %>% kable_styling()
sterile_counts <- read.csv("GSE76164_PSGE_sterile.csv")
sterile_counts <- sterile_counts[!sterile_counts$ID==".",]
row.names(sterile_counts) <- paste(sterile_counts$ID,sterile_counts$SNP.pos,sep=":")

reproductive_counts <- read.csv("GSE76164_PSGE_reproductive.csv")
reproductive_counts <- reproductive_counts[!reproductive_counts$ID==".",]
row.names(reproductive_counts) <- paste(reproductive_counts$ID,reproductive_counts$SNP.pos,sep=":")


Preprocess F1 count matrices


Filter low-count SNPs

Remove rows where any column has <=lcf reads by cross.

lcf <- 5

sterile_counts <- sterile_counts[rowSums(sterile_counts[,4:29])>lcf*(length(names(sterile_counts[,4:29]))/2),]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,4:27])>lcf*(length(names(reproductive_counts[,4:27]))/2),]

l875.s <- metadata[metadata$parent%in%c("875D","875Q")&metadata$phenotype=="sterile","sample.id"]
l888.s <- metadata[metadata$parent%in%c("888D","888Q")&metadata$phenotype=="sterile","sample.id"]
l882.s <- metadata[metadata$parent%in%c("882D","882Q")&metadata$phenotype=="sterile","sample.id"]
l894.s <- metadata[metadata$parent%in%c("894D","894Q")&metadata$phenotype=="sterile","sample.id"]

sterile_counts <- sterile_counts[rowSums(sterile_counts[,l875.s])>lcf,]
sterile_counts <- sterile_counts[rowSums(sterile_counts[,l888.s])>lcf,]
sterile_counts <- sterile_counts[rowSums(sterile_counts[,l882.s])>lcf,]
sterile_counts <- sterile_counts[rowSums(sterile_counts[,l894.s])>lcf,]

l875.r <- metadata[metadata$parent%in%c("875D","875Q")&metadata$phenotype=="reproductive","sample.id"]
l888.r <- metadata[metadata$parent%in%c("888D","888Q")&metadata$phenotype=="reproductive","sample.id"]
l882.r <- metadata[metadata$parent%in%c("882D","882Q")&metadata$phenotype=="reproductive","sample.id"]
l894.r <- metadata[metadata$parent%in%c("894D","894Q")&metadata$phenotype=="reproductive","sample.id"]

reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l875.r])>lcf,]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l888.r])>lcf,]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l882.r])>lcf,]
reproductive_counts <- reproductive_counts[rowSums(reproductive_counts[,l894.r])>lcf,]

### Remove rows with greater than 10000 counts (cannot run SK test on these)
reproductive_counts$SUM <- rowSums(reproductive_counts[,4:27])
reproductive_counts <- reproductive_counts[reproductive_counts$SUM<10000,]
reproductive_counts$SUM <- NULL

sterile_counts$SUM <- rowSums(sterile_counts[,4:29])
sterile_counts <- sterile_counts[sterile_counts$SUM<10000,]
sterile_counts$SUM <- NULL
### Remove rows with greater than 10000 counts (cannot run SK test on these)
Reproductive SNPs
length(row.names(reproductive_counts))
## [1] 36216
Sterile SNPs
length(row.names(sterile_counts))
## [1] 36683

Split count matrices by lineage and parent of origin

p pat/mat lineage
p1 pat AHB
p1 mat EHB
p2 pat EHB
p2 mat AHB
# Sterile
## p1
p1.sterile.pat <- sterile_counts[,metadata[metadata$parent%in%c("875D","882D")&
                                         metadata$phenotype=="sterile","sample.id"]]
p1.sterile.mat <- sterile_counts[,metadata[metadata$parent%in%c("875Q","882Q")&
                                         metadata$phenotype=="sterile","sample.id"]]
names(p1.sterile.pat) <- names(p1.sterile.mat)
## p2
p2.sterile.pat <- sterile_counts[,metadata[metadata$parent%in%c("888D","894D")&
                                         metadata$phenotype=="sterile","sample.id"]]
p2.sterile.mat <- sterile_counts[,metadata[metadata$parent%in%c("888Q","894Q")&
                                         metadata$phenotype=="sterile","sample.id"]]
names(p2.sterile.pat) <- names(p2.sterile.mat)

# Reproductive
## p1
p1.reproductive.pat <- reproductive_counts[,metadata[metadata$parent%in%c("875D","882D")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
p1.reproductive.mat <- reproductive_counts[,metadata[metadata$parent%in%c("875Q","882Q")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
names(p1.reproductive.pat) <- names(p1.reproductive.mat)
## p2
p2.reproductive.pat <- reproductive_counts[,metadata[metadata$parent%in%c("888D","894D")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
p2.reproductive.mat <- reproductive_counts[,metadata[metadata$parent%in%c("888Q","894Q")&
                                                       metadata$phenotype=="reproductive","sample.id"]]
names(p2.reproductive.pat) <- names(p2.reproductive.mat)


Functions for statistical tests


Storer-Kim test [twobinom]

(Function from the WRS2 package). Test the hypothesis that two independent binomials have equal probability of success using the Storer–Kim method.

twobinom<-function(r1=sum(elimna(x)),n1=length(x),
                   r2=sum(elimna(y)),n2=length(y),
                   x=NA,y=NA,alpha=.05){
  # Modified for efficiency: changed outer() command to Rfast::Outer()
  # Requires Rfast & tryCatchLog packages!
  # r1=number of successes in group 1
  # r2=number of successes in group 2
  # n1=number of observations in group 1
  # n2=number of observations in group 2
  n1p<-n1+1
  n2p<-n2+1
  n1m<-n1-1
  n2m<-n2-1
  chk<-abs(r1/n1-r2/n2)
  x<-c(0:n1)/n1
  y<-c(0:n2)/n2
  phat<-(r1+r2)/(n1+n2)
  m1<-t(Outer(x,y,"-"))
  m2<-matrix(1,n1p,n2p)
  flag<-(abs(m1)>=chk)
  m3<-m2*flag
  rm(m1,m2,flag)
  b1<-1
  b2<-1
  xv<-c(1:n1)
  yv<-c(1:n2)
  xv1<-n1-xv+1
  yv1<-n2-yv+1
  dis1<-c(1,pbeta(phat,xv,xv1))
  dis2<-c(1,pbeta(phat,yv,yv1))
  pd1<-NA
  pd2<-NA
  for(i in 1:n1){pd1[i]<-dis1[i]-dis1[i+1]}
  for(i in 1:n2){pd2[i]<-dis2[i]-dis2[i+1]}
  pd1[n1p]<-phat^n1
  pd2[n2p]<-phat^n2
  m4<-t(Outer(pd1,pd2,"*"))
  test<-sum(m3*m4)
  rm(m3,m4)
  list(p.value=test,p1=r1/n1,p2=r2/n2,est.dif=r1/n1-r2/n2)
}

Wrapper for Storer-Kim test [test.df]

  1. Calculate %p1 and %p2 for each SNP.
  • %p1 = sum(AHB in EHBxAHB)/(AHB in EHBxAHB + EHB in EHBxAHB).
  • %p2 = sum(AHB in AHBxEHB)/(AHB in AHBxEHB + EHB in AHBxEHB).
  1. If either %p1 or %p2 is NA (due to divide by 0), set to 0.
  2. Compute Storer-Kim test p-value for each SNP.
  3. If test failed, set p-value to 1.
  4. For each possible parent and lineage bias, if %p1 and %p2 pass defined thresholds [high and low], set the associated test p-value to the Storer-Kim test p-value; otherwise, set to 1.
  5. Return a data.frame where each row corresponds to one SNP and the associated parent and lineage bias p-values.
test.df <- function(p1.pat.df,p1.mat.df,
                    p2.pat.df,p2.mat.df,high,low,
                    stest=c("CS","SK","thresholds"),
                    startrow=1,endrow="n"){
  # Requires Rfast & tryCatchLog packages!
  if(endrow=="n"){endrow <- length(row.names(p1.pat.df))}
  return.list <- list()
  p1.pat.df <- p1.pat.df[startrow:endrow,]
  p1.mat.df <- p1.mat.df[startrow:endrow,]
  p2.pat.df <- p2.pat.df[startrow:endrow,]
  p2.mat.df <- p2.mat.df[startrow:endrow,]
  i.len <- length(row.names(p1.pat.df))
  for(i in 1:i.len){
    print(i)
    p1.pat <- p1.pat.df[i,]
    p1.mat <- p1.mat.df[i,]
    p2.pat <- p2.pat.df[i,]
    p2.mat <- p2.mat.df[i,]
    p1.s <- sum(p1.pat)
    p1.o <- sum(p1.pat,p1.mat)
    p1 <- p1.s/p1.o
    if(is.na(p1)){p1 <- 0}
    p2.s <- sum(p2.mat)
    p2.o <- sum(p2.pat,p2.mat)
    p2 <- p2.s/p2.o
    if(is.na(p2)){p2 <- 0}
    
    if(stest=="SK"){
      tryCatchLog::tryCatchLog(test <- twobinom(r1=p1.s,n1=p1.o,r2=p2.s,n2=p2.o)$p.value,
                               error = function(e) {test <- "MEM.EXHAUST"})
    }
    
    if(stest=="CS"){
      test <- chisq.test(data.frame(X=c(p1.s,p2.s),Y=c(p1.o,p2.o)))$p.value
    }
    
    if(stest=="thresholds"){
      test <- 0
    }
    
    if(is.nan(test)){test <- 1}
    if(p1>high&p2<low){pat.test <- test}else{pat.test <- 1}
    if(p1<low&p2>high){mat.test <- test}else{mat.test <- 1}
    if(p1<low&p2<low){EHB.test <- test}else{EHB.test <- 1}
    if(p1>high&p2>high){AHB.test <- test}else{AHB.test <- 1}
    
    return.df <- data.frame(SNP_gene=row.names(p1.pat.df[i,]),
                            pat.p=pat.test,mat.p=mat.test,
                            AHB.p=AHB.test,EHB.p=EHB.test)
    
    return.list[[i]] <- return.df
  }
  return(bind_rows(return.list))
}

General linear mixed effects model [GLIMMIX]

GLIMMIX <- function(counts,metadata){
  # Requires tryCatchLog package!
  counts$SNP_gene <- row.names(counts)
  parent.list <- list()
  parent.p.list <- list()
  lineage.list <- list()
  lineage.p.list <- list()
  counts$geneID <- as.character(unlist(map(strsplit(counts$SNP_gene, split = ":"), 1)))
  genelist <- unique(counts$geneID)
  counts <- counts[,-c(1:3)]
  for(i in 1:length(genelist)){
    print(paste("Row ",i," of ",length(genelist),sep=""))
    counts.sub <- counts[counts$geneID==genelist[i],]
    counts.sub$geneID <- NULL
    counts.sub <- gather(counts.sub, sample.id, count, 
                         names(counts.sub), -SNP_gene, factor_key=TRUE)
    counts.sub <- join(counts.sub, metadata, by = "sample.id")[,-c(5:6)]
    counts.sub$parent <- substr(counts.sub$parent,4,4)
    counts.sub$parent <- as.factor(counts.sub$parent)
    counts.sub$SNP_gene <- as.factor(counts.sub$SNP_gene)
    counts.sub$lineage <- as.factor(counts.sub$lineage)
    counts.sub$sample.id <- as.factor(counts.sub$sample.id)
    
    testfail <- F
    test <- "null"
    
    if(length(levels(counts.sub$SNP_gene))>1){
      tryCatchLog(test <- lmer(count~parent+lineage+parent*lineage+(1|SNP_gene)+(1|sample.id),
                               data=counts.sub),
                  error = function(e) {testfail <- T})
    }else{
      tryCatchLog(test <- lmer(count~parent+lineage+parent*lineage+(1|sample.id),
                               data=counts.sub),
                  error = function(e) {testfail <- T})
    }
    
    if(class(test)=="character"){testfail <- T}
    
    if(testfail==F){
      test.p <- Anova(test)
      test <- summary(test)
      parent.list[i] <- test[["coefficients"]][2,1]
      parent.p.list[i] <- test.p[1,3]
      lineage.list[i] <- test[["coefficients"]][3,1]
      lineage.p.list[i] <- test.p[2,3]
    }else{
      parent.list[i] <- 0
      parent.p.list[i] <- 1
      lineage.list[i] <- 0
      lineage.p.list[i] <- 1
    }
    cat("\014") 
  }
  return(data.frame(ID=genelist,
                    parent.dirpat.est=unlist(parent.list),
                    parent.dirpat.p=unlist(parent.p.list),
                    lineageEHB.est=unlist(lineage.list),
                    lineageEHB.p=unlist(lineage.p.list)))
}


Conduct statistical tests


sterile.SK <- test.df(p1.sterile.pat,p1.sterile.mat,
                      p2.sterile.pat,p2.sterile.mat,
                      low=0.4,high=0.6,"SK")
write.csv(sterile.SK,"sterile_SK.csv")

sterile.GLIMMIX <- GLIMMIX(sterile_counts,metadata)
write.csv(sterile.GLIMMIX,"sterile_GLIMMIX.csv")


reproductive.SK <- test.df(p1.reproductive.pat,p1.reproductive.mat,
                           p2.reproductive.pat,p2.reproductive.mat,
                           low=0.4,high=0.6,"SK")
write.csv(reproductive.SK,"reproductive_SK.csv")

reproductive.GLIMMIX <- GLIMMIX(reproductive_counts,metadata)
write.csv(reproductive.GLIMMIX,"reproductive_GLIMMIX.csv")


Plot results


Description under construction.

Sterile F1s


Set up a data.frame to plot %p1 and %p2 for each SNP

## p1 = sum(A in ExA)/(A in ExA + E in ExA)
p1.sterile.plot <- data.frame(rowSums(p1.sterile.pat)/(rowSums(p1.sterile.mat)+
                                                         rowSums(p1.sterile.pat)))
names(p1.sterile.plot) <- c("p1")

## p2 = sum(A in AxE)/(A in AxE + E in AxE)
p2.sterile.plot <- data.frame(rowSums(p2.sterile.mat)/(rowSums(p2.sterile.mat)+
                                                         rowSums(p2.sterile.pat)))
names(p2.sterile.plot) <- c("p2")

Join results of Storer-Kim tests

sterile.plot <- cbind(p1.sterile.plot,p2.sterile.plot)
sterile.plot <- sterile.plot[row.names(sterile.plot)%in%sterile.SK$SNP_gene,]
sterile.plot <- cbind(sterile.plot,sterile.SK[,c(2:5)])
sterile.plot$SNP_gene <- row.names(sterile.plot)
sterile.plot$gene <- as.character(map(strsplit(sterile.plot$SNP_gene, split = ":"), 1))

For each gene, check whether all SNPs are biased in the same direction and overlap with GLIMMIX test results

sterile.plot$bias <- "NA"
sterile.plot[sterile.plot$pat.p<0.05,"bias"] <- "pat"
sterile.plot[sterile.plot$mat.p<0.05,"bias"] <- "mat"
sterile.plot[sterile.plot$EHB.p<0.05,"bias"] <- "EHB"
sterile.plot[sterile.plot$AHB.p<0.05,"bias"] <- "AHB"
biaslist <- data.frame(matrix(ncol=2,nrow=0))
names(biaslist) <- c("gene","bias")
genelist <- unique(sterile.plot$gene)
for(i in 1:length(genelist)){
 tmp <- unique(sterile.plot[sterile.plot$gene==genelist[i],"bias"])
 if(length(tmp)>1){
    if(length(tmp)==2){
     if(any(tmp%in%"NA")){
       bias <- tmp[!tmp%in%"NA"]
     }
    }else{
     bias <- "NA"
   }
 }else{bias <- tmp}
   biaslist <- rbind(biaslist,data.frame(gene=genelist[[i]], bias=bias))
}
sterile.plot <- sterile.plot %>% 
  left_join(biaslist, by = c('gene' = 'gene')) 
names(sterile.plot) <- c("p1","p2","pat.p","mat.p",
                         "AHB.p","EHB.p","SNP_gene","gene","xbias","bias")

bias.plot <- list()
for(i in 1:length(row.names(sterile.plot))){
  if(!sterile.plot[i,"bias"]=="NA"){
    bias.plot[i] <- "NA"
    if(sterile.plot[i,"bias"]=="EHB"&sterile.plot[i,"p1"]<0.4&sterile.plot[i,"p2"]<0.4){
      bias.plot[i] <- "EHB"
    }
    if(sterile.plot[i,"bias"]=="mat"&sterile.plot[i,"p1"]<0.4&sterile.plot[i,"p2"]>0.6){
      bias.plot[i] <- "mat"
    }
    if(sterile.plot[i,"bias"]=="AHB"&sterile.plot[i,"p1"]>0.6&sterile.plot[i,"p2"]>0.6){
      bias.plot[i] <- "AHB"
    }
    if(sterile.plot[i,"bias"]=="pat"&sterile.plot[i,"p1"]>0.6&sterile.plot[i,"p2"]<0.4){
      bias.plot[i] <- "pat"
    }
  }else{
    bias.plot[i] <- "NA"
  }
}

sterile.plot$bias.plot <- unlist(bias.plot)
sterile.plot <- rbind(sterile.plot[sterile.plot$bias.plot%in%c("NA"),],
                      sterile.plot[sterile.plot$bias.plot%in%c("mat", "AHB", "EHB", "pat"),])
sterile.plot$bias.plot <- factor(sterile.plot$bias.plot,
                                 levels = c("NA","mat", "AHB", "EHB", "pat"))
sterile.GLIMMIX.biased <- sterile.GLIMMIX[sterile.GLIMMIX$parent.dirpat.p<0.05|sterile.GLIMMIX$lineageEHB.p<0.05,1]
sterile.plot[!sterile.plot$gene%in%sterile.GLIMMIX.biased,"bias.plot"] <- "NA" 

Generate plot for sterile F1s

g1 <- ggplot(sterile.plot, aes(x=p1, y=p2,
                                    color=bias.plot,alpha=0.8)) + 
  geom_point() + theme_classic() +
  xlab(expression(paste("% A allele in ",E[mother],
                        " x ",A[father],sep=""))) +
  ylab(expression(paste("% A allele in ",A[mother],
                        " x ",E[father],sep=""))) +
  ggtitle("sterile workers") +
  theme(text = element_text(size=20),
        plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(breaks = levels(sterile.plot$bias)[-c(1)],
                     values=c("grey", magma(20)[c(2)], 
                              magma(20)[c(8)], magma(20)[c(12)], 
                              magma(20)[c(16)]))+
  guides(alpha=F, color=F)
g1

Reproductive F1s


Set up a data.frame to plot %p1 and %p2 for each SNP

## p1 = sum(A in ExA)/(A in ExA + E in ExA)
p1.reproductive.plot <- data.frame(rowSums(p1.reproductive.pat)/(rowSums(p1.reproductive.mat)+
                                                                   rowSums(p1.reproductive.pat)))
names(p1.reproductive.plot) <- c("p1")

## p2 = sum(A in AxE)/(A in AxE + E in AxE)
p2.reproductive.plot <- data.frame(rowSums(p2.reproductive.mat)/(rowSums(p2.reproductive.mat)+
                                                                   rowSums(p2.reproductive.pat)))
names(p2.reproductive.plot) <- c("p2")

Join results of Storer-Kim tests

reproductive.plot <- cbind(p1.reproductive.plot,p2.reproductive.plot)
reproductive.plot <- reproductive.plot[row.names(reproductive.plot)%in%reproductive.SK$SNP_gene,]
reproductive.plot <- cbind(reproductive.plot,reproductive.SK[,c(2:5)])
reproductive.plot$SNP_gene <- row.names(reproductive.plot)
reproductive.plot$gene <- as.character(map(strsplit(reproductive.plot$SNP_gene, split = ":"), 1))

For each gene, check whether all SNPs are biased in the same direction and overlap with GLIMMIX test results

reproductive.plot$bias <- "NA"
reproductive.plot[reproductive.plot$pat.p<0.05,"bias"] <- "pat"
reproductive.plot[reproductive.plot$mat.p<0.05,"bias"] <- "mat"
reproductive.plot[reproductive.plot$EHB.p<0.05,"bias"] <- "EHB"
reproductive.plot[reproductive.plot$AHB.p<0.05,"bias"] <- "AHB"
biaslist <- data.frame(matrix(ncol=2,nrow=0))
names(biaslist) <- c("gene","bias")
genelist <- unique(reproductive.plot$gene)
for(i in 1:length(genelist)){
  tmp <- unique(reproductive.plot[reproductive.plot$gene==genelist[i],"bias"])
  if(length(tmp)>1){
     if(length(tmp)==2){
      if(any(tmp%in%"NA")){
        bias <- tmp[!tmp%in%"NA"]
      }
     }else{
      bias <- "NA"
     }
  }else{bias <- tmp}
  biaslist <- rbind(biaslist,data.frame(gene=genelist[[i]], bias=bias))
}
reproductive.plot <- reproductive.plot %>% 
  left_join(biaslist, by = c('gene' = 'gene')) 
names(reproductive.plot) <- c("p1","p2","pat.p","mat.p",
                              "AHB.p","EHB.p","SNP_gene","gene","xbias","bias")

bias.plot <- list()
for(i in 1:length(row.names(reproductive.plot))){
  if(!reproductive.plot[i,"bias"]=="NA"){
    bias.plot[i] <- "NA"
    if(reproductive.plot[i,"bias"]=="EHB"&reproductive.plot[i,"p1"]<0.4&reproductive.plot[i,"p2"]<0.4){
      bias.plot[i] <- "EHB"
    }
    if(reproductive.plot[i,"bias"]=="mat"&reproductive.plot[i,"p1"]<0.4&reproductive.plot[i,"p2"]>0.6){
      bias.plot[i] <- "mat"
    }
    if(reproductive.plot[i,"bias"]=="AHB"&reproductive.plot[i,"p1"]>0.6&reproductive.plot[i,"p2"]>0.6){
      bias.plot[i] <- "AHB"
    }
    if(reproductive.plot[i,"bias"]=="pat"&reproductive.plot[i,"p1"]>0.6&reproductive.plot[i,"p2"]<0.4){
      bias.plot[i] <- "pat"
    }
  }else{
    bias.plot[i] <- "NA"
  }
}

reproductive.plot$bias.plot <- unlist(bias.plot)
reproductive.plot <- rbind(reproductive.plot[reproductive.plot$bias.plot%in%c("NA"),],
                           reproductive.plot[reproductive.plot$bias.plot%in%c("mat", "AHB", "EHB", "pat"),])
reproductive.plot$bias.plot <- factor(reproductive.plot$bias.plot,
                                      levels = c("NA","mat", "AHB", "EHB", "pat"))
reproductive.GLIMMIX.biased <- reproductive.GLIMMIX[reproductive.GLIMMIX$parent.dirpat.p<0.05|reproductive.GLIMMIX$lineageEHB.p<0.05,1]
reproductive.plot[!reproductive.plot$gene%in%reproductive.GLIMMIX.biased,"bias.plot"] <- "NA" 

Generate plot for reproductive F1s

g2 <- ggplot(reproductive.plot, aes(x=p1, y=p2,
                                    color=bias.plot,alpha=0.8)) + 
  geom_point() + theme_classic() +
  xlab(expression(paste("% A allele in ",E[mother],
                        " x ",A[father],sep=""))) +
  ylab(expression(paste("% A allele in ",A[mother],
                        " x ",E[father],sep=""))) +
  ggtitle("reproductive workers") +
  theme(text = element_text(size=20),
        plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(breaks = levels(reproductive.plot$bias)[-c(1)],
                     values=c("grey", magma(20)[c(2)], 
                              magma(20)[c(8)], magma(20)[c(12)], 
                              magma(20)[c(16)]))+
  guides(alpha=F, color=F)
g2

Final plot


Make center tri-plot

gmid.df <- data.frame(Sterile=c(length(unique(sterile.plot[sterile.plot$bias.plot=="mat","gene"])),
                                length(unique(sterile.plot[sterile.plot$bias.plot=="AHB","gene"])),
                                length(unique(sterile.plot[sterile.plot$bias.plot=="EHB","gene"])),
                                length(unique(sterile.plot[sterile.plot$bias.plot=="pat","gene"]))),
                      Bias=c("mat","AHB","EHB","pat"),
                      Reproductive=c(length(unique(reproductive.plot[reproductive.plot$bias.plot=="mat","gene"])),
                                     length(unique(reproductive.plot[reproductive.plot$bias.plot=="AHB","gene"])),
                                     length(unique(reproductive.plot[reproductive.plot$bias.plot=="EHB","gene"])),
                                     length(unique(reproductive.plot[reproductive.plot$bias.plot=="pat","gene"]))))

## Test if # of sterile biased genes is different from reproductive biased genes
mat.test <- chisq.test(data.frame(Success=c(gmid.df[1,1],gmid.df[1,3]),
                                         Failure=c(length(unique(sterile_counts$ID))-gmid.df[1,1],
                                                   length(unique(sterile_counts$ID))-gmid.df[1,3]),
                                         row.names=c("Sterile","Reproductive")))$p.value
AHB.test <- chisq.test(data.frame(Success=c(gmid.df[2,1],gmid.df[2,3]),
                                  Failure=c(length(unique(sterile_counts$ID))-gmid.df[2,1],
                                            length(unique(sterile_counts$ID))-gmid.df[2,3]),
                                  row.names=c("Sterile","Reproductive")))$p.value
## Warning in chisq.test(data.frame(Success = c(gmid.df[2, 1], gmid.df[2, 3]), :
## Chi-squared approximation may be incorrect
EHB.test <- chisq.test(data.frame(Success=c(gmid.df[3,1],gmid.df[3,3]),
                                  Failure=c(length(unique(sterile_counts$ID))-gmid.df[3,1],
                                            length(unique(sterile_counts$ID))-gmid.df[3,3]),
                                  row.names=c("Sterile","Reproductive")))$p.value
pat.test <- chisq.test(data.frame(Success=c(gmid.df[4,1],gmid.df[4,3]),
                                  Failure=c(length(unique(sterile_counts$ID))-gmid.df[4,1],
                                            length(unique(sterile_counts$ID))-gmid.df[4,3]),
                                  row.names=c("Sterile","Reproductive")))$p.value
## Build table
gmid.df$`.` <- c(mat.test,AHB.test,EHB.test,pat.test)

gmid.df <- gmid.df[,c(4,1,2,3)]
nsrows <- row.names(gmid.df[gmid.df$`.`>0.05,])
gmid.df$`.` <- formatC(gmid.df$`.`, format = "e", digits = 2)
gmid.df[nsrows,"."] <- "(ns)"

cols <- matrix("black", nrow(gmid.df), ncol(gmid.df))
cols[1,3] <- magma(20)[c(2)]
cols[2,3] <- magma(20)[c(8)]
cols[3,3] <- magma(20)[c(12)]
cols[4,3] <- magma(20)[c(16)]

ccols <- matrix("white", nrow(gmid.df), ncol(gmid.df))
ccols[1,3] <- "#e4d8d1"
ccols[2,3] <- "#e4d8d1"
ccols[3,3] <- "#e4d8d1"
ccols[4,3] <- "#e4d8d1"
ccols[1,2] <- "#f4efea"
ccols[2,2] <- "#f4efea"
ccols[3,2] <- "#f4efea"
ccols[4,2] <- "#f4efea"
ccols[1,4] <- "#f4efea"
ccols[2,4] <- "#f4efea"
ccols[3,4] <- "#f4efea"
ccols[4,4] <- "#f4efea"

cfonts <- matrix("plain", nrow(gmid.df), ncol(gmid.df))
cfonts[1,3] <- "bold"
cfonts[2,3] <- "bold"
cfonts[3,3] <- "bold"
cfonts[4,3] <- "bold"

tt <- ttheme_default(core=list(fg_params = list(col = cols, 
                                                cex = 1,
                                                fontface = cfonts),
                               bg_params = list(col=NA,
                                                fill = ccols),
                               padding.h=unit(2, "mm")),
                     rowhead=list(bg_params = list(col=NA)),
                     colhead=list(bg_params = list(fill = c("white",
                                                            "#f4efea",
                                                            "#e4d8d1",
                                                            "#f4efea")),
                                  fg_params = list(rot=90,
                                                   cex = 1,
                                                   col = c("white",
                                                           "black",
                                                           "black",
                                                           "black"))))

gmid <- tableGrob(gmid.df, rows = NULL, theme=tt)

plot(gmid)

Join plots to generate the final plot

fig1 <- arrangeGrob(g1, gmid, g2, widths=c(5,2.5,5))
ggsave(file="fig1.png", plot=fig1, width=18, height=6)